package com.fastflow.fastflow.controller;

import com.xxl.crawler.XxlCrawler;
import com.xxl.crawler.pageloader.param.Response;
import com.xxl.crawler.pageparser.PageParser;
import org.jsoup.nodes.Document;

/**
 * Stand-alone demo that runs an {@link XxlCrawler} against the Gitee
 * "explore" listing ({@code order=starred}) and prints the text of every
 * page handed to the parser callback.
 */
public class TestMain {

    /** Entry URL the crawler starts from. */
    private static final String ENTRY_URL = "https://gitee.com/explore/all?order=starred&page=1";

    /**
     * Whitelist regex for URLs the crawler may spread to; it bounds the crawl
     * to listing pages whose {@code page} parameter is a single digit 1-5.
     */
    private static final String SPREAD_WHITELIST_REGEX =
            "^https:\\/\\/gitee\\.com\\/explore\\/all\\?order=starred&page=[1-5]$";

    /** Size of the crawler's thread pool. */
    private static final int CRAWLER_THREADS = 3;

    /** Pause between fetches, in milliseconds, to avoid putting too much load on the target site. */
    private static final int PAUSE_BETWEEN_FETCHES_MILLIS = 100;

    /**
     * Configures the crawler from the constants above and starts it.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        XxlCrawler giteeCrawler = new XxlCrawler.Builder()
                .setUrls(ENTRY_URL)
                // Let the crawler follow links found on fetched pages ...
                .setAllowSpread(true)
                // ... but only those matching the whitelist.
                .setWhiteUrlRegexs(SPREAD_WHITELIST_REGEX)
                .setThreadCount(CRAWLER_THREADS)
                .setPauseMillis(PAUSE_BETWEEN_FETCHES_MILLIS)
                .setPageParser(newTextPrintingParser())
                .build();

        // NOTE(review): the `true` flag presumably requests a synchronous (blocking) run —
        // confirm against the XxlCrawler documentation.
        giteeCrawler.start(true);
    }

    /**
     * Builds the parser callback used by the demo: it writes the text content
     * of each parsed document to standard output.
     *
     * @return a parser that prints {@code response.getHtml().text()}
     */
    private static PageParser newTextPrintingParser() {
        return new PageParser() {

            @Override
            public void afterParse(Response response) {
                Document parsedPage = response.getHtml();
                System.out.println(parsedPage.text());
            }

        };
    }

}
